This notebook aims at locally training a neural network for sentiment analysis, before deployment on Azure.
We'll compare several text-normalization and embedding methods:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import nltk
from nltk.stem import snowball
from nltk.corpus import stopwords
import spacy
import gensim.downloader
import gensim.models
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import keras_tuner as kt
import string
import os
import itertools
from collections import Counter
import re
# Load spacy model for lemmatization
spacy_model = 'en_core_web_lg'
if spacy_model not in spacy.util.get_installed_models():
!{sys.executable} -m spacy download {spacy_model}
# nltk.download('stopwords')
# enabling plots export to html
import plotly
plotly.offline.init_notebook_mode()
%%time
# Load the Sentiment140 dump; keep only the tweet text (col 5)
# and its sentiment label (col 0: 0 = negative, 4 = positive).
tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None)[[5, 0]]
tweets.columns = ['text', 'sentiment']
CPU times: user 2.24 s, sys: 158 ms, total: 2.4 s Wall time: 2.49 s
tweets
| text | sentiment | |
|---|---|---|
| 0 | @switchfoot http://twitpic.com/2y1zl - Awww, t... | 0 |
| 1 | is upset that he can't update his Facebook by ... | 0 |
| 2 | @Kenichan I dived many times for the ball. Man... | 0 |
| 3 | my whole body feels itchy and like its on fire | 0 |
| 4 | @nationwideclass no, it's not behaving at all.... | 0 |
| ... | ... | ... |
| 1599995 | Just woke up. Having no school is the best fee... | 4 |
| 1599996 | TheWDB.com - Very cool to hear old Walt interv... | 4 |
| 1599997 | Are you ready for your MoJo Makeover? Ask me f... | 4 |
| 1599998 | Happy 38th Birthday to my boo of alll time!!! ... | 4 |
| 1599999 | happy #charitytuesday @theNSPCC @SparksCharity... | 4 |
1600000 rows × 2 columns
# Stratified 10,000-tweet working sample (preserves the 0/4 class balance)
sample, _ = train_test_split(tweets, train_size=10_000, stratify=tweets['sentiment'], random_state=42)
sample
| text | sentiment | |
|---|---|---|
| 500415 | still sitting under the dryer, my neck hurts | 0 |
| 1577236 | @sarahshah this is my nightmare (even tho i on... | 4 |
| 178111 | @mjvarela black is good... tight, or should I ... | 0 |
| 396033 | Takes forever for everybody to get ready. | 0 |
| 31962 | @Bklyncookie omg all the LA bad weather aura i... | 0 |
| ... | ... | ... |
| 1282270 | @ChristinaNewman pounds are SOO over rated! I ... | 4 |
| 436582 | I'm going to miss dancing this summer. | 0 |
| 552624 | I need a hug...and less cynicism. It's making ... | 0 |
| 443309 | ugh... i have an upset stomach...ugh ... i no ... | 0 |
| 1270172 | At noodleword ! With a couple faggets. Hahah jk | 4 |
10000 rows × 2 columns
# Split the sample into 80% train, then halve the remainder into
# 10% validation and 10% test, stratified on sentiment throughout.
train_set, holdout = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
val_set, test_set = train_test_split(
    holdout, train_size=0.5, stratify=holdout['sentiment'], random_state=42)
del holdout
print(f'train set shape: {train_set.shape}')
print(f'validation set shape: {val_set.shape}')
print(f'test set shape: {test_set.shape}')
train set shape: (8000, 2) validation set shape: (1000, 2) test set shape: (1000, 2)
class DataPreprocessor:
    '''
    Preprocess text according to normalization method (lemmatization, stemming
    or keep original form), and optionally embedding.
    Process sentiment column into values 0 (for happy tweet) or 1 (for unhappy tweet).
    Return dataframe with 1st column for sentiment and other columns for vectors.
    '''

    def __init__(self, normalization='lem', embedding=None):
        '''
        normalization: 'lem' (spaCy lemmatization), 'stem' (nltk Snowball
        stemming) or 'keep' (no transformation).
        embedding: None, or one of 'word2vec' / 'fasttext' / 'glove' to load
        the matching gensim pretrained vectors.
        Raises ValueError on an unknown normalization or embedding name.
        '''
        if normalization.lower() not in ['lem', 'stem', 'keep']:
            raise ValueError('Invalid normalization method. Valid values are'
                             ' "lem" (Spacy lemmatization), "stem" (nltk stemming)'
                             ' and "keep" (no transformation).')
        self.normalization = normalization
        if self.normalization == 'stem':
            self.stemmer = snowball.EnglishStemmer()
        elif self.normalization == 'lem':
            # relies on the module-level spacy_model name
            self.nlp = spacy.load(spacy_model)
        self.stop_words = stopwords.words('english')
        self.embedding = embedding
        # short embedding names -> gensim-downloader model names
        self.vec_methods = {'word2vec': 'word2vec-google-news-300',
                            'fasttext': 'fasttext-wiki-news-subwords-300',
                            'glove': 'glove-twitter-200'}
        if self.embedding:
            if self.embedding.lower() not in self.vec_methods:
                # BUG FIX: the message was passed as two args to ValueError,
                # which prints as a tuple; build a single string instead.
                raise ValueError('Invalid embedding method. Valid values are '
                                 + ', '.join(self.vec_methods.keys()))
            self.vectors = self._get_pretrained_vectors()

    def _get_pretrained_vectors(self):
        '''
        Return vectors from self.embedding pretrained model
        (downloaded/cached by gensim.downloader).
        '''
        model_name = self.vec_methods.get(self.embedding)
        print(f'Loading vectors for {self.embedding} model, please wait...')
        vectors = gensim.downloader.load(model_name)
        print('Vectors loaded.')
        return vectors

    def _normalize_text(self, input_string):
        '''
        Return input_string after deleting stop words / twitter user names /
        URLs / punctuation / digits / multiple spaces, then stemming or
        lemmatization according to self.normalization (lemmas are lowercased;
        'keep' preserves the original casing). Returns np.nan when nothing
        is left so callers can dropna().
        '''
        result = input_string
        # BUG FIX: raw strings — the originals used invalid escape
        # sequences ("\@", "\S", "\d") inside plain string literals.
        twitter_pattern = r'@\S*'
        url_pattern = r'http\S*|www\S*'
        punct_pattern = '[' + re.escape(string.punctuation) + r'\d]'
        for pattern in [twitter_pattern, url_pattern, punct_pattern]:
            result = re.sub(pattern, '', result)
        result = re.sub('[ ]{2,}', ' ', result)
        if self.normalization == 'keep':
            result = ' '.join([word for word in result.split() if word not in self.stop_words])
        elif self.normalization == 'stem':
            result = ' '.join([self.stemmer.stem(word) for word in result.split() if word not in self.stop_words])
        elif self.normalization == 'lem':
            result = ' '.join([tok.lemma_.lower() for tok in self.nlp(result) if tok.text not in self.stop_words])
        # BUG FIX: np.NaN was removed in NumPy 2.0; np.nan is the canonical name
        return result.strip() or np.nan

    def _embed_sentence(self, sentence):
        '''
        Return the average vector of all in-vocabulary words in the sentence
        (a zero vector when no word is known to the embedding).
        '''
        sentence_vec = np.zeros((self.vectors.vector_size,))
        known_words = [word for word in sentence.split() if word in self.vectors.key_to_index]
        if known_words:
            sentence_vec = np.mean([self.vectors[word] for word in known_words], axis=0)
        return sentence_vec

    def _embed_dataset(self, dataframe, sentiment_col='sentiment', text_col='text'):
        '''
        For given dataframe with columns "text" and "sentiment", return
        a dataframe with same nb of rows, with first column = "sentiment"
        and next columns = embedding vector for "text". Embedding method
        is based on self.embedding.
        '''
        vec_df = pd.DataFrame(dataframe[text_col].apply(
            self._embed_sentence).tolist(), index=dataframe.index)
        result = pd.concat((pd.DataFrame(dataframe[sentiment_col]), vec_df), axis=1)
        return result

    def preprocess_dataset(self, dataframe, sentiment_col='sentiment', text_col='text', pos_label=0):
        '''
        Return tuple :
        - Dataframe for text vectors (or normalized text when no embedding)
        - Series for sentiment, recoded so pos_label --> 1 and anything
          else --> 0 (by default 0/unhappy --> 1 and 4/happy --> 0)
        Rows whose text is empty after cleaning are dropped.
        '''
        result = dataframe.copy()
        result[sentiment_col] = (result[sentiment_col] == pos_label).astype(int)
        result[text_col] = result[text_col].apply(self._normalize_text)
        result = result.dropna()
        if self.embedding:
            result = self._embed_dataset(result, sentiment_col=sentiment_col, text_col=text_col)
        text_cols = [col for col in result.columns if col != sentiment_col]
        return result.loc[:, text_cols], result.loc[:, sentiment_col]
#########################################
############ TESTS ##############
#########################################
dp = DataPreprocessor()
# a digits-only input collapses to empty text, which must become NaN
assert np.isnan(dp._normalize_text('123'))
# handles, URLs and digit-bearing tokens are stripped; plain words survive
cleaned = dp._normalize_text('keep @mimi2000 http://www.yahoo.fr keep123 www.google.com 78910 keep')
assert cleaned == "keep keep keep"
print('All tests passed.')
All tests passed.
Since for us the positive case is the case of negative/unhappy sentiment, we turn the "sentiment" column into expected values:
Text must be cleaned before embedding. We'll remove:
Then we'll apply stemming or lemmatization to enhance the model performance. We'll compare the performance of both methods through the model results. Here is an example of each preprocessing method:
# Demonstrate each normalization method on one sample tweet.
test_string = "@mimi2000 We, finally!: went to the shopping) 12centers! 34"
print('Test string:')
print(test_string)
print('\nPreprocessed string with lemmatization:')
print(DataPreprocessor(normalization='lem')._normalize_text(test_string))
print('\nPreprocessed string with stemming:')
print(DataPreprocessor(normalization='stem')._normalize_text(test_string))
# BUG FIX: corrected the typo "lemmaization" in the printed label
print('\nPreprocessed string with no stemming/lemmatization:')
print(DataPreprocessor(normalization='keep')._normalize_text(test_string))
Test string:
@mimi2000 We, finally!: went to the shopping) 12centers! 34
Preprocessed string with lemmatization:
Loading vectors for word2vec model, please wait...
Vectors loaded.
we finally go shopping center
Preprocessed string with stemming:
Loading vectors for word2vec model, please wait...
Vectors loaded.
we final went shop center
Preprocessed string with no stemming/lemmaization:
Loading vectors for word2vec model, please wait...
Vectors loaded.
We finally went shopping centers
For our first try, we'll use pre-trained Word2vec English model from Gensim.
dp.embedding
'word2vec'
dp.vectors.similar_by_word('cat')
[('cats', 0.8099379539489746),
('dog', 0.7609456181526184),
('kitten', 0.7464985251426697),
('feline', 0.7326233983039856),
('beagle', 0.7150582671165466),
('puppy', 0.7075453400611877),
('pup', 0.6934289932250977),
('pet', 0.6891530752182007),
('felines', 0.6755931377410889),
('chihuahua', 0.6709762811660767)]
dp.vectors.similar_by_word('dog')
[('dogs', 0.8680490851402283),
('puppy', 0.8106428384780884),
('pit_bull', 0.780396044254303),
('pooch', 0.7627375721931458),
('cat', 0.7609457969665527),
('golden_retriever', 0.7500901222229004),
('German_shepherd', 0.7465173006057739),
('Rottweiler', 0.7437615990638733),
('beagle', 0.7418619990348816),
('pup', 0.7406911253929138)]
To embed whole sentences, we'll average the vectors of each word.
Our function is ready to preprocess each dataset:
def _prep(frame, label):
    '''Preprocess one split with the shared DataPreprocessor, with progress prints.'''
    print(f'Preprocessing {label} set...')
    X, y = dp.preprocess_dataset(frame)
    print(f'{label.capitalize()} set preprocessed.')
    return X, y

X_train, y_train = _prep(train_set, 'train')
X_val, y_val = _prep(val_set, 'val')
X_test, y_test = _prep(test_set, 'test')
Preprocessing train set... Train set preprocessed. Preprocessing val set... Val set preprocessed. Preprocessing test set... Test set preprocessed.
Now that we have cleaned the data, we can create the model:
dp.embedding
'word2vec'
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Baseline: plain logistic regression on the embedded tweets.
lr = LogisticRegression()
lr.fit(X_train, np.ravel(y_train))
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# beta=2 weighs recall twice as heavily as precision
precision, recall, f2score, _ = precision_recall_fscore_support(y_test, y_pred, beta=2, average='binary')
# BUG FIX: added the missing space after "F2-score:" so all labels print consistently
print(f'Accuracy: {accuracy:.2%}\nPrecision: {precision:.2%}\nRecall: {recall:.2%}\nF2-score: {f2score:.2%}')
Accuracy: 71.92% Precision: 72.24% Recall: 71.08% F2-score:71.31%
# Create model
def build_model(activation='tanh'):
nb_hidden_layers = 3
nb_units = 128
dropout_rate = 0.1
learning_rate = 0.01
initializers = {'tanh': keras.initializers.glorot_normal,
'relu': keras.initializers.he_normal,
'selu': keras.initializers.lecun_normal,}
initializer = initializers.get(activation)
model = keras.models.Sequential()
model.add(keras.layers.Input(shape=(X_train.shape[1])))
for _ in range(nb_hidden_layers):
model.add(keras.layers.Dense(nb_units, activation=activation, kernel_initializer=initializer))
model.add(keras.layers.Dropout(dropout_rate))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss = "binary_crossentropy",
optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
metrics=['accuracy',
keras.metrics.Precision(name='precision'),
keras.metrics.Recall(name='recall')])
return model
def plot_history(history, height=500):
    '''Plot train/validation loss, accuracy, precision and recall curves.'''
    metrics_df = pd.DataFrame(history.history)
    epoch_ids = np.array(metrics_df.index).astype(int)
    # one color per metric; train is solid, validation is dotted
    palette = {'loss': 'red', 'accuracy': 'green',
               'precision': 'blue', 'recall': 'goldenrod'}
    fig = go.Figure()
    for metric, color in palette.items():
        fig.add_trace(go.Scatter(x=epoch_ids, y=metrics_df[metric],
                                 legendgroup=metric,
                                 legendgrouptitle_text=metric,
                                 name="train " + metric,
                                 mode='lines',
                                 line={'color': color},
                                 hoverinfo='y'))
        fig.add_trace(go.Scatter(x=epoch_ids, y=metrics_df['val_' + metric],
                                 legendgroup=metric,
                                 name="validation " + metric,
                                 mode='lines',
                                 line={'color': color, 'dash': 'dot'},
                                 hoverinfo='y'))
    fig.update_layout(width=height * 1.5, height=height,
                      title="Training results (click on legend items to hide lines)")
    fig.update_xaxes(title_text='epochs')
    fig.update_yaxes(range=[0, 1], title_text='value')
    fig.show()
# Create custom function for Tensorboards logfiles
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    '''Return a fresh timestamp-named subdirectory path under root_logdir.'''
    from time import strftime
    return os.path.join(root_logdir, strftime("run_%Y_%m_%d-%H_%M_%S"))

run_logdir = get_run_logdir()
# TensorBoard callback writing into the fresh per-run log directory
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
# Fit model
model = build_model()
# up to 50 epochs, stopping once val_loss has not improved for 5 epochs
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                    verbose=0, callbacks=[tensorboard_cb,
                                          keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
plot_history(history)
The recall oscillates a lot. Maybe tuning the batch size will help? Let's train the model with different batch sizes, then, for each series of resulting val_recall values, compute its standard deviation:
val_recall_std_for_batch_size = dict()
for bs in tqdm(range(32, 513, 32)):
    # fresh model for every batch size so runs stay independent
    model = build_model()
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        batch_size=bs, verbose=0,
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
    val_recall_std_for_batch_size[bs] = np.std(history.history['val_recall'])
val_std_df = pd.DataFrame.from_dict(val_recall_std_for_batch_size, orient='index').reset_index()
# stringify batch sizes so plotly treats the x axis as categorical
val_std_df['index'] = val_std_df['index'].astype(str)
px.bar(val_std_df, x='index', y=0,
       title='Standard deviation of val_recall according to batch size', labels={'index': 'batch_size', '0':'val_recall standard deviation'})
Batch size does not seem to be significant to reduce val_recall oscillations. Maybe another activation function may help?
# Compare training dynamics of two alternative activation functions.
for activation_function in ('relu', 'selu'):
    print(f'With {activation_function} activation')
    history = build_model(activation_function).fit(
        X_train, y_train, epochs=50, validation_data=(X_val, y_val),
        verbose=0,
        callbacks=[tensorboard_cb,
                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
    plot_history(history)
With relu activation
With selu activation
We notice that the model converges much faster with the SELU activation function, but it still oscillates a lot.
We used lemmatization and Word2vec embedding. Let's compare with other normalizing and embedding methods.
# get a bigger sample
sample, _ = train_test_split(tweets, train_size=80_000, stratify=tweets['sentiment'], random_state=42)
# 80% train, then the remaining 20% halved into validation and test,
# stratified on sentiment at every step
train_set, remainder = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
val_set, test_set = train_test_split(
    remainder, train_size=0.5, stratify=remainder['sentiment'], random_state=42)
del remainder
# Benchmark every normalization x embedding combination on the same splits.
results = pd.DataFrame(columns=['normalization', 'embedding', 'accuracy', 'precision', 'recall', 'f2 score'])
preproc_params = ['stem', 'lem']
vecto_params = ['word2vec', 'glove', 'fasttext']
params = list(itertools.product(preproc_params, vecto_params))
for normalization, embedding in tqdm(params):
    # preprocess datasets
    print(f'Preprocessing datasets with {normalization} and {embedding}')
    dp = DataPreprocessor(normalization=normalization, embedding=embedding)
    X_train, y_train = dp.preprocess_dataset(train_set)
    X_val, y_val = dp.preprocess_dataset(val_set)
    X_test, y_test = dp.preprocess_dataset(test_set)
    # Create new model
    model = build_model('selu')
    # Fit model
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)],
                        verbose=0)
    print('Model fitted')
    # save results
    _, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
    # F-beta with beta=2 weighs recall twice as heavily as precision
    f2_score = 5 * precision * recall / ((4 * precision) + recall)
    # BUG FIX: DataFrame.append was removed in pandas 2.0; append via .loc
    results.loc[len(results)] = [normalization, embedding, accuracy,
                                 precision, recall, f2_score]
Preprocessing datasets with stem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted
results.sort_values(by='f2 score').style.format(dict.fromkeys(results.columns[2:], '{:.2%}'))
| normalization | embedding | accuracy | precision | recall | f2 score | |
|---|---|---|---|---|---|---|
| 1 | stem | glove | 73.44% | 72.14% | 76.41% | 75.52% |
| 3 | lem | word2vec | 74.44% | 73.51% | 76.46% | 75.85% |
| 0 | stem | word2vec | 72.13% | 69.98% | 77.59% | 75.94% |
| 2 | stem | fasttext | 72.18% | 69.82% | 78.19% | 76.36% |
| 5 | lem | fasttext | 74.63% | 73.35% | 77.41% | 76.56% |
| 4 | lem | glove | 75.13% | 73.86% | 77.84% | 77.01% |
Lemmatization with glove embedding seems to be the best combination, so we'll use it for the next steps. But there is not a huge difference between the combinations.
# Retained combination from the benchmark above: lemmatization + GloVe
dp = DataPreprocessor('lem', 'glove')
Loading vectors for glove model, please wait... Vectors loaded.
# Re-embed all three splits with the retained lem + glove combination
X_train, y_train = dp.preprocess_dataset(train_set)
X_val, y_val = dp.preprocess_dataset(val_set)
X_test, y_test = dp.preprocess_dataset(test_set)
Since the recall is rather wobbly, we won't monitor it for tuning hyperparameters, but val_loss instead.
def build_model(hp):
    '''Build a tunable dense classifier for keras-tuner.

    hp: keras_tuner.HyperParameters used to sample depth, width, activation,
    dropout, learning rate and optimizer. Reads the module-level X_train to
    size the input layer. Returns a compiled keras model.
    '''
    nb_hidden_layers = hp.Choice('nb_hidden_layers', values=[1, 2, 3, 4, 5, 6])
    nb_units = hp.Choice('nb_units', values=[8, 16, 32, 64, 128])
    # pair each activation with its recommended weight initializer
    initializers = {'tanh': keras.initializers.glorot_normal,
                    'relu': keras.initializers.he_normal,
                    'selu': keras.initializers.lecun_normal}
    activation = hp.Choice('activation', values=list(initializers.keys()))
    dropout = hp.Boolean('dropout')
    dropout_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.5)
    learning_rate = hp.Float("learning_rate", min_value=0.0001, max_value=0.1, sampling='log')
    # BUG FIX: hp.Choice yields a *string*; the original called that string
    # as a constructor, so map the sampled name to an optimizer class and
    # instantiate it with the sampled learning rate.
    optimizer_classes = {'adam': keras.optimizers.Adam,
                         'sgd': keras.optimizers.SGD,
                         'rmsprop': keras.optimizers.RMSprop}
    optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])
    model = keras.models.Sequential()
    # shape must be a tuple — (n) is just a parenthesized int
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for _ in range(nb_hidden_layers):
        model.add(keras.layers.Dense(nb_units, activation=activation,
                                     kernel_initializer=initializers[activation]))
        if dropout:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer_classes[optimizer_name](learning_rate=learning_rate),
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
# Bayesian-optimization tuner minimizing validation loss over 50 trials
tuner = kt.BayesianOptimization(hypermodel=build_model,
                                objective=kt.Objective("val_loss", direction='min'),
                                max_trials=50,
                                overwrite=True,
                                directory='my_dir',
                                project_name='essai')
# stop a trial early once val_loss has stalled for 5 epochs
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X_train, y_train, validation_data=(X_val, y_val), callbacks=[stop_early],)
Now that the tuner has found good parameters, we can use them in our model:
# Retrieve the winning hyperparameters once, report them, then retrain.
best_hp = tuner.get_best_hyperparameters()[0]
print('Best parameters:')
for key, value in best_hp.values.items():
    print(key, ':', value)
model = build_model(best_hp)
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)],
                    epochs=50,
                    verbose=0)
Best parameters: nb_hidden_layers : 1 nb_units : 32 activation : relu dropout : False dropout_rate : 0.1 learning_rate : 0.0001 optimizer : adam
plot_history(history)
# Summarize the spread of validation recall across the epochs.
recall_curve = history.history['val_recall']
print(f"Recall is comprised between {min(recall_curve):.2%} and {max(recall_curve):.2%}, "
      f"with an average of {np.mean(recall_curve):.2%}.")
Recall is comprised between 71.41% and 79.54%, with an average of 75.96%.
This is a little better than our baseline recall on a simple logistic regression (72%).
We can try to improve our model by using LSTM. To do so, we can't use the vectorized dataset we had till now: instead, we need to input sentences as sequences.
So let's first reprocess our dataset only to get cleaned words:
train_set
| text | sentiment | |
|---|---|---|
| 125864 | just had a shower now watchin sum crap on tv | 0 |
| 1044472 | hitting the town toniiiiight | 4 |
| 239791 | i want to go swimming RIGHT NOW | 0 |
| 1258166 | @erickimberlin owl city announced tour dates f... | 4 |
| 423104 | losecontec 1 harian penuh | 0 |
| ... | ... | ... |
| 1204879 | hiking in Zuma Beach Canyon and Drunken Noodle... | 4 |
| 750462 | @corwin I still need to get a guitarhero/rockb... | 0 |
| 489188 | @pigslove55 a que bonito, you gals are having ... | 0 |
| 625396 | Stupid bus broke down so will be late to googl... | 0 |
| 207403 | My parents & bros. went to the mall withou... | 0 |
64000 rows × 2 columns
val_set
| text | sentiment | |
|---|---|---|
| 1117299 | Just open a twitter account | 4 |
| 933938 | showerr | 4 |
| 1407999 | @oliviaspruill no I AM Tanner! But you can be ... | 4 |
| 165081 | for all the effort and time put into some of t... | 0 |
| 514196 | @irishlad585 haah I've been saying bedtime for... | 0 |
| ... | ... | ... |
| 763071 | @slushy_gutter Hmm, I dunno if he did... he se... | 0 |
| 952839 | says OMG! i'ma buy the couple shirt i saw onli... | 4 |
| 1309207 | Is watching CNN, talking to Katie, & lovin... | 4 |
| 1044675 | @malliboo mmkay sounds good | 4 |
| 480214 | The pool goddess frowned on us! We lost our ne... | 0 |
8000 rows × 2 columns
# Lemmatize only — no pretrained embedding; the LSTM will learn its own
dp = DataPreprocessor(normalization='lem', embedding=None)
As this model is much slower to train, we'll work on a smaller sample.
small_sample_proportion = 0.15

def _subsample(frame):
    '''Return a reproducible 15% random subsample of frame.'''
    return frame.sample(frac=small_sample_proportion, random_state=42)

X_train_raw, y_train = dp.preprocess_dataset(_subsample(train_set))
X_val_raw, y_val = dp.preprocess_dataset(_subsample(val_set))
X_test_raw, y_test = dp.preprocess_dataset(_subsample(test_set))
# Cache each split to CSV, deriving the file name from the variable name
# via an identity lookup in the module namespace.
for dataset in (X_train_raw, X_val_raw, X_test_raw, y_train, y_val, y_test):
    var_name = next(n for n, v in globals().items() if v is dataset)
    dataset.to_csv(var_name + '.csv')
# Reload the cached splits from disk.
# BUG FIX: index_col=0 restores the original tweet ids as the index;
# without it read_csv re-adds them as an 'Unnamed: 0' data column.
# The y_* targets were saved from Series, so squeeze their single
# column back to a Series (matching how they are displayed/used below).
for df_name in ['X_train_raw', 'X_val_raw', 'X_test_raw', 'y_train', 'y_val', 'y_test']:
    frame = pd.read_csv(df_name + '.csv', index_col=0)
    globals()[df_name] = frame.squeeze('columns') if df_name.startswith('y_') else frame
X_train_raw
| text | |
|---|---|
| 642229 | credit gone go within phone call |
| 1083523 | go burn burn burn gym |
| 382370 | its poore north wales amp cold |
| 329956 | try check chick fil boston market we oregon |
| 1413806 | may slight soft spot robin hood but happen sat... |
| ... | ... |
| 635700 | shame |
| 969568 | i tweety bird |
| 1266257 | wait get home dinner back hack got to finish s... |
| 544456 | im piss i miss half season season finale ugh fml |
| 574001 | annoy nothing sweet home die chocolate |
9563 rows × 1 columns
y_train
642229 1
1083523 0
382370 1
329956 1
1413806 0
..
635700 1
969568 0
1266257 0
544456 1
574001 1
Name: sentiment, Length: 9563, dtype: int64
X_train_raw['text'].values
array(['credit gone go within phone call', 'go burn burn burn gym',
'its poore north wales amp cold', ...,
'wait get home dinner back hack got to finish strong one',
'im piss i miss half season season finale ugh fml',
'annoy nothing sweet home die chocolate'], dtype=object)
class Encoder:
    '''
    Tool to create vocabulary from a dataset and encode
    a tensor of string into a tensor of voc indices.
    '''
    def __init__(self, vocab_size=10_000, num_oov_buckets=1000):
        # fixed padded sentence width (in tokens); learned in fit()
        self.padding = None
        # word -> index lookup table; built in fit()
        self.table = None
        # keep only the vocab_size most frequent words
        self.vocab_size = vocab_size
        # extra hash buckets for out-of-vocabulary words
        self.num_oov_buckets=num_oov_buckets
    def fit(self, data):
        '''Create vocabulary-to-indices lookup table from data (array of strings).'''
        # turn preprocessed text into padded vectors
        X_batch = tf.strings.split(data).to_tensor(default_value=b'<pad>')
        # remember the padded width so transform() pads to the same shape
        self.padding = X_batch.shape[1]
        # Build vocabulary limited to vocab_size most frequent words
        # NOTE(review): the b'<pad>' filler is counted too, so it very likely
        # occupies one of the vocab_size slots — confirm this is intended.
        vocabulary = Counter()
        for sentence in X_batch:
            vocabulary.update(list(sentence.numpy()))
        truncated_vocabulary = [item[0]
                                for item in vocabulary.most_common(self.vocab_size)]
        # built lookup table to convert each sentence in tensor of indices
        words = tf.constant(truncated_vocabulary)
        words_ids = tf.range(len(truncated_vocabulary), dtype='int64')
        vocab_init = tf.lookup.KeyValueTensorInitializer(words, words_ids)
        # OOV words hash into num_oov_buckets extra indices past the vocabulary
        self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.num_oov_buckets)
    def transform(self, data):
        '''Return a tensor of tensors of indices for data.'''
        # NOTE(review): shape[1] is pinned to the width seen in fit(); a longer
        # sentence at transform time would fail here — confirm acceptable.
        X_batch = tf.strings.split(data).to_tensor(default_value=b'<pad>', shape=[None, self.padding])
        return self.table.lookup(X_batch)
# Fit the vocabulary on the training text only, then encode all three splits
encoder = Encoder()
encoder.fit(X_train_raw['text'].values)
X_train = encoder.transform(X_train_raw['text'].values)
X_val = encoder.transform(X_val_raw['text'].values)
X_test = encoder.transform(X_test_raw['text'].values)
# build and train model
def build_lstm_model():
    '''Build a two-layer LSTM sentiment classifier over the encoded vocabulary.'''
    embed_size = 128
    # embedding rows must cover the vocabulary plus the OOV hash buckets
    vocab_total = encoder.vocab_size + encoder.num_oov_buckets
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(vocab_total, embed_size, input_shape=[None]))
    model.add(keras.layers.LSTM(embed_size, return_sequences=True))
    model.add(keras.layers.LSTM(embed_size))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
model = build_lstm_model()
# up to 15 epochs; stop once val_loss has not improved for 3 epochs
history = model.fit(X_train, y_train,
                    epochs=15,
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)],
                    validation_data=(X_val, y_val))
plot_history(history)
The model converges fast, so let's train it again with epoch=1:
# NOTE(review): this fits the already-trained model for one extra epoch;
# it does not retrain from scratch — confirm that is what was intended.
history = model.fit(X_train, y_train,
                    epochs=1,
                    validation_data=(X_val, y_val))
299/299 [==============================] - 12s 39ms/step - loss: 0.1970 - accuracy: 0.9285 - precision: 0.9271 - recall: 0.9294 - val_loss: 0.9818 - val_accuracy: 0.6697 - val_precision: 0.6621 - val_recall: 0.6667
# Evaluate on the held-out test set and report metrics as percentages.
loss, accuracy, precision, recall = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.2%}')
print(f'Precision: {precision:.2%}')
print(f'Recall: {recall:.2%}')
38/38 [==============================] - 1s 13ms/step - loss: 0.9020 - accuracy: 0.6931 - precision: 0.7013 - recall: 0.7024 Accuracy: 69.31% Precision: 70.13% Recall: 70.24%
Let's try to improve this result by tuning hyperparameters:
def build_lstm_model(hp):
    '''Build a tunable LSTM classifier for keras-tuner.

    Hyperparameters explored:
      - embed_size: embedding dimension (also used as LSTM units)
      - nb_units: number of stacked LSTM layers before the final one
      - dropout / dropout_rate: optional Dropout after each stacked LSTM
      - learning_rate: Adam learning rate, sampled log-uniformly

    Returns a compiled keras.Model producing a sigmoid probability.
    '''
    embed_size = hp.Choice('embed_size', values=[128, 256])
    nb_units = hp.Choice('nb_units', values=[1, 2])
    dropout = hp.Boolean('dropout')
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.5)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-1, sampling='log')
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(encoder.vocab_size + encoder.num_oov_buckets, embed_size,
                                     input_shape=[None]))
    for _ in range(nb_units):
        model.add(keras.layers.LSTM(embed_size, return_sequences=True))
        if dropout:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.LSTM(embed_size))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    # BUG FIX: the sampled learning_rate was previously never used
    # (optimizer='adam' always ran with the default 1e-3, making the
    # learning-rate search a no-op); wire it into the Adam optimizer.
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
# Bayesian-optimization search over the LSTM hyperparameters, minimizing
# validation loss; each trial is early-stopped after 3 stagnant epochs.
tuner = kt.BayesianOptimization(hypermodel=build_lstm_model,
                                objective=kt.Objective("val_loss",direction='min'),
                                max_trials=50,
                                overwrite=True,
                                directory='my_dir',
                                project_name='essai')
tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=50,
             callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)])
Trial 50 Complete [00h 01m 04s] val_loss: 0.5836336016654968 Best val_loss So Far: 0.5676937103271484 Total elapsed time: 01h 35m 26s INFO:tensorflow:Oracle triggered exit
# Report the best hyperparameter values found by the search.
print('Best parameters:')
for key, value in tuner.get_best_hyperparameters()[0].values.items():
    print(key, ':', value)
# Re-train the best configuration 10 times to gauge run-to-run variance.
# Each run appends a single scalar: the final epoch's validation recall.
recalls = []
for _ in tqdm(range(10)):
    model = build_lstm_model(tuner.get_best_hyperparameters()[0])
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)],
                        epochs=5,)
    recalls.append(history.history['val_recall'][-1])
Best parameters: embed_size : 128 nb_units : 1 dropout : False dropout_rate : 0.2 learning_rate : 0.034964109403531694
Epoch 1/5 299/299 [==============================] - 13s 38ms/step - loss: 0.6261 - accuracy: 0.6437 - precision: 0.6745 - recall: 0.5500 - val_loss: 0.5890 - val_accuracy: 0.6899 - val_precision: 0.7438 - val_recall: 0.5607 Epoch 2/5 299/299 [==============================] - 11s 38ms/step - loss: 0.4532 - accuracy: 0.8051 - precision: 0.8000 - recall: 0.8114 - val_loss: 0.6133 - val_accuracy: 0.6991 - val_precision: 0.7198 - val_recall: 0.6325 Epoch 1/5 299/299 [==============================] - 13s 38ms/step - loss: 0.6175 - accuracy: 0.6480 - precision: 0.6611 - recall: 0.6016 - val_loss: 0.5901 - val_accuracy: 0.7049 - val_precision: 0.6768 - val_recall: 0.7624 Epoch 2/5 299/299 [==============================] - 11s 37ms/step - loss: 0.4476 - accuracy: 0.8084 - precision: 0.8001 - recall: 0.8202 - val_loss: 0.6720 - val_accuracy: 0.6882 - val_precision: 0.6490 - val_recall: 0.7932 Epoch 1/5 299/299 [==============================] - 15s 44ms/step - loss: 0.6842 - accuracy: 0.5293 - precision: 0.5250 - recall: 0.5752 - val_loss: 0.6877 - val_accuracy: 0.5759 - val_precision: 0.8911 - val_recall: 0.1538 Epoch 2/5 299/299 [==============================] - 12s 42ms/step - loss: 0.5669 - accuracy: 0.7183 - precision: 0.7059 - recall: 0.7444 - val_loss: 0.5852 - val_accuracy: 0.7058 - val_precision: 0.7017 - val_recall: 0.6957 Epoch 3/5 299/299 [==============================] - 12s 42ms/step - loss: 0.3910 - accuracy: 0.8398 - precision: 0.8308 - recall: 0.8517 - val_loss: 0.6178 - val_accuracy: 0.6966 - val_precision: 0.6773 - val_recall: 0.7282 Epoch 1/5 299/299 [==============================] - 15s 43ms/step - loss: 0.6475 - accuracy: 0.6002 - precision: 0.5923 - recall: 0.6325 - val_loss: 0.5893 - val_accuracy: 0.7008 - val_precision: 0.7500 - val_recall: 0.5846 Epoch 2/5 299/299 [==============================] - 12s 41ms/step - loss: 0.4717 - accuracy: 0.7850 - precision: 0.7839 - recall: 0.7845 - val_loss: 0.5842 - val_accuracy: 0.6957 - val_precision: 
0.7033 - val_recall: 0.6564 Epoch 3/5 299/299 [==============================] - 12s 42ms/step - loss: 0.3337 - accuracy: 0.8700 - precision: 0.8668 - recall: 0.8732 - val_loss: 0.7328 - val_accuracy: 0.6949 - val_precision: 0.6833 - val_recall: 0.7043 Epoch 1/5 299/299 [==============================] - 15s 43ms/step - loss: 0.6483 - accuracy: 0.6003 - precision: 0.6128 - recall: 0.5363 - val_loss: 0.5873 - val_accuracy: 0.6907 - val_precision: 0.6662 - val_recall: 0.7402 Epoch 2/5 299/299 [==============================] - 12s 42ms/step - loss: 0.4677 - accuracy: 0.7912 - precision: 0.7970 - recall: 0.7791 - val_loss: 0.5888 - val_accuracy: 0.7075 - val_precision: 0.7646 - val_recall: 0.5829 Epoch 1/5 299/299 [==============================] - 15s 43ms/step - loss: 0.6515 - accuracy: 0.6070 - precision: 0.5999 - recall: 0.6329 - val_loss: 0.5981 - val_accuracy: 0.6890 - val_precision: 0.7131 - val_recall: 0.6120 Epoch 2/5 299/299 [==============================] - 12s 42ms/step - loss: 0.4769 - accuracy: 0.7864 - precision: 0.7791 - recall: 0.7969 - val_loss: 0.7425 - val_accuracy: 0.6840 - val_precision: 0.6516 - val_recall: 0.7641 Epoch 1/5 299/299 [==============================] - 15s 43ms/step - loss: 0.6409 - accuracy: 0.6173 - precision: 0.6703 - recall: 0.4555 - val_loss: 0.5961 - val_accuracy: 0.6806 - val_precision: 0.6483 - val_recall: 0.7624 Epoch 2/5 299/299 [==============================] - 13s 43ms/step - loss: 0.4652 - accuracy: 0.7930 - precision: 0.8042 - recall: 0.7722 - val_loss: 0.5746 - val_accuracy: 0.7091 - val_precision: 0.7080 - val_recall: 0.6923 Epoch 3/5 299/299 [==============================] - 13s 42ms/step - loss: 0.3373 - accuracy: 0.8685 - precision: 0.8693 - recall: 0.8660 - val_loss: 0.7197 - val_accuracy: 0.6832 - val_precision: 0.7058 - val_recall: 0.6068 Epoch 1/5 299/299 [==============================] - 15s 43ms/step - loss: 0.6506 - accuracy: 0.6099 - precision: 0.6186 - recall: 0.5647 - val_loss: 0.6299 - 
val_accuracy: 0.6832 - val_precision: 0.6838 - val_recall: 0.6581 Epoch 2/5 299/299 [==============================] - 13s 42ms/step - loss: 0.5293 - accuracy: 0.7576 - precision: 0.7568 - recall: 0.7562 - val_loss: 0.6051 - val_accuracy: 0.7091 - val_precision: 0.7102 - val_recall: 0.6872 Epoch 3/5 299/299 [==============================] - 13s 42ms/step - loss: 0.4074 - accuracy: 0.8348 - precision: 0.8275 - recall: 0.8442 - val_loss: 0.6543 - val_accuracy: 0.7033 - val_precision: 0.6890 - val_recall: 0.7197 Epoch 1/5 299/299 [==============================] - 15s 44ms/step - loss: 0.6556 - accuracy: 0.6026 - precision: 0.6356 - recall: 0.4733 - val_loss: 0.5882 - val_accuracy: 0.7117 - val_precision: 0.7163 - val_recall: 0.6821 Epoch 2/5 299/299 [==============================] - 13s 42ms/step - loss: 0.4863 - accuracy: 0.7786 - precision: 0.7777 - recall: 0.7778 - val_loss: 0.5955 - val_accuracy: 0.7041 - val_precision: 0.7164 - val_recall: 0.6564 Epoch 1/5 299/299 [==============================] - 15s 44ms/step - loss: 0.6922 - accuracy: 0.5127 - precision: 0.5111 - recall: 0.4924 - val_loss: 0.6930 - val_accuracy: 0.5096 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 Epoch 2/5 299/299 [==============================] - 13s 42ms/step - loss: 0.6188 - accuracy: 0.6390 - precision: 0.6536 - recall: 0.5853 - val_loss: 0.6081 - val_accuracy: 0.6848 - val_precision: 0.7518 - val_recall: 0.5333 Epoch 3/5 299/299 [==============================] - 13s 43ms/step - loss: 0.4470 - accuracy: 0.8070 - precision: 0.8063 - recall: 0.8060 - val_loss: 0.5854 - val_accuracy: 0.7158 - val_precision: 0.7228 - val_recall: 0.6821 Epoch 4/5 299/299 [==============================] - 13s 42ms/step - loss: 0.3222 - accuracy: 0.8764 - precision: 0.8751 - recall: 0.8769 - val_loss: 0.6237 - val_accuracy: 0.6865 - val_precision: 0.6738 - val_recall: 0.6991
# BUG FIX: `recalls` already holds scalars (the last-epoch val_recall of each
# run), so `item[-1]` would raise TypeError (floats are not subscriptable);
# plot the list directly.
px.box(recalls, labels={'variable': 'recall'}, range_y=[0,1], title="Distribution of recall on validation set")
# NOTE(review): `model` is the one from the LAST of the 10 runs above,
# not necessarily the best-performing run — confirm this is intended.
loss, accuracy, precision, recall = model.evaluate(X_test, y_test)
print('Accuracy:', '{:.2%}'.format(accuracy))
print('Precision:','{:.2%}'.format(precision))
print('Recall:', '{:.2%}'.format(recall))
38/38 [==============================] - 0s 11ms/step - loss: 0.6160 - accuracy: 0.7065 - precision: 0.7122 - recall: 0.7203 Accuracy: 70.65% Precision: 71.22% Recall: 72.03%
This model is not much better than the previous one, and its training — even on a much smaller dataset (15% of the data) — takes much longer (about 6 times).
We'll use the standard BERT model with 12 hidden layers, 768 neurons per layer and 12 attention heads.
The text needs to be preprocessed and encoded before being fed into the BERT neural network, so we'll also use the appropriate preprocessor.
# TF-Hub handles for standard BERT (12 layers, 768 hidden units, 12
# attention heads) and its matching text preprocessor.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
Since BERT has its own preprocessing and can take into account punctuation, conjugated forms, etc., we clean the text in the simplest way, just removing Twitter handles and URLs (no stemming / lemmatization).
small_sample_frac = 0.10
# Subsample each split (BERT fine-tuning is slow); fixed seed for reproducibility.
small_train_set = train_set.sample(frac=small_sample_frac, random_state=42)
small_val_set = val_set.sample(frac=small_sample_frac, random_state=42)
small_test_set = test_set.sample(frac=small_sample_frac, random_state=42)
# Strip Twitter handles and URLs only; BERT's own preprocessing handles the rest.
# FIX: use a raw string — the previous non-raw '\@' is an invalid string escape
# (DeprecationWarning today, SyntaxError in future Python versions), and '@'
# needs no escaping in the regex itself.
_mention_url_pattern = r'@\S*|http\S*|www\S*'
X_train = tf.strings.regex_replace(small_train_set['text'], _mention_url_pattern, '')
X_val = tf.strings.regex_replace(small_val_set['text'], _mention_url_pattern, '')
X_test = tf.strings.regex_replace(small_test_set['text'], _mention_url_pattern, '')
# Binary target: 1 = negative tweet (original label 0), 0 = positive (label 4).
y_train = (small_train_set['sentiment'].values==0).astype(int)
y_val = (small_val_set['sentiment'].values==0).astype(int)
y_test = (small_test_set['sentiment'].values==0).astype(int)
Let's preprocess example sentences:
# Run two example sentences through the BERT preprocessor and inspect the
# three tensors it produces (word ids, segment/type ids, attention mask),
# printing only the first 20 token positions of each.
sentence_1 = "Data Science is the best invention since sliced bread."
sentence_2 = "To improve is to change, so to be perfect is to change often."
preprocessed_sentences = bert_preprocess_model([sentence_1, sentence_2])
for key in preprocessed_sentences.keys():
    print(key)
    print(preprocessed_sentences[key][:, :20])
    print('*'*20)
input_word_ids
tf.Tensor(
[[ 101 2951 2671 2003 1996 2190 11028 2144 15920 7852 1012 102
0 0 0 0 0 0 0 0]
[ 101 2000 5335 2003 2000 2689 1010 2061 2000 2022 3819 2003
2000 2689 2411 1012 102 0 0 0]], shape=(2, 20), dtype=int32)
********************
input_type_ids
tf.Tensor(
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(2, 20), dtype=int32)
********************
input_mask
tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]], shape=(2, 20), dtype=int32)
********************
Now let's test what our BERT model can do with our preprocessed sentences:
# Encode the preprocessed example sentences and list the output tensors.
results = bert_encoder(preprocessed_sentences)
results.keys()
dict_keys(['encoder_outputs', 'default', 'pooled_output', 'sequence_output'])
# Inspect the sentence-level outputs ('default' and 'pooled_output', which
# print identical values here) and the per-token 'sequence_output'.
for key in ['default', 'pooled_output', 'sequence_output']:
    print(key)
    print('shape:', results[key].shape)
    print(results[key])
    print('*'*20)
default
shape: (2, 768)
tf.Tensor(
[[-0.9512843 -0.6032508 -0.9545547 ... -0.874434 -0.7910525
0.9142777 ]
[-0.9189594 -0.56259525 -0.9720294 ... -0.9071247 -0.7950544
0.87461 ]], shape=(2, 768), dtype=float32)
********************
pooled_output
shape: (2, 768)
tf.Tensor(
[[-0.9512843 -0.6032508 -0.9545547 ... -0.874434 -0.7910525
0.9142777 ]
[-0.9189594 -0.56259525 -0.9720294 ... -0.9071247 -0.7950544
0.87461 ]], shape=(2, 768), dtype=float32)
********************
sequence_output
shape: (2, 128, 768)
tf.Tensor(
[[[-0.10590196 0.26851505 -0.14423418 ... -0.35868776 0.67331445
0.41718978]
[-0.36307073 -0.40274557 0.22424859 ... -0.3605184 0.5813349
0.5045001 ]
[-0.56774175 -0.10595442 -0.08213287 ... -1.0117962 0.18640223
0.09948587]
...
[-0.07537924 -0.4467767 0.13783732 ... 0.42594665 0.49459484
-0.03347448]
[ 0.05718884 -0.10596795 0.23254201 ... 0.15991673 0.21175867
0.20602745]
[ 0.04381774 -0.08595694 0.2329374 ... 0.10957312 0.16571827
0.19322857]]
[[ 0.03066833 0.09868553 -0.6011374 ... -0.35253045 0.36468536
0.76799 ]
[ 0.5709117 0.72137654 -0.70406556 ... 0.78116554 1.0891954
0.53473175]
[ 0.43270245 0.15248676 0.21829745 ... -0.56087095 0.6071634
-1.4340926 ]
...
[-0.24884136 -0.541174 -0.09731398 ... 0.5463477 0.29288173
-0.00449529]
[ 0.3623999 0.0258242 0.27541623 ... 0.04379031 0.00182082
-0.33718944]
[ 0.3851114 0.00677254 0.29776073 ... 0.06883734 -0.01784195
-0.3610733 ]]], shape=(2, 128, 768), dtype=float32)
********************
for each sentence:
Now we can build the classification model:
def build_bert_model():
    '''Assemble the BERT classifier: preprocessing -> encoder -> dropout -> sigmoid.

    Takes raw strings as input and returns an (uncompiled) keras.Model
    emitting a single sigmoid probability per example.
    '''
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    # preprocessing layer: tokenizes raw strings into BERT's input dict
    tokens = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)
    # encoding layer: keep the sentence-level pooled embedding
    pooled = hub.KerasLayer(tfhub_handle_encoder, name='BERT_encoder')(tokens)['pooled_output']
    # classifier layer
    regularized = tf.keras.layers.Dropout(0.1)(pooled)
    probability = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(regularized)
    return tf.keras.Model(raw_text, probability)
# Compile the BERT classifier with the same loss/metrics as the LSTM
# baselines so results are directly comparable.
bert_model = build_bert_model()
bert_model.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy',
                            keras.metrics.Precision(name='precision'),
                            keras.metrics.Recall(name='recall')])
# Fine-tune BERT end-to-end for 5 epochs (no early stopping here).
history = bert_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5)
Epoch 1/5 200/200 [==============================] - 795s 4s/step - loss: 0.6578 - accuracy: 0.6077 - precision: 0.6055 - recall: 0.5930 - val_loss: 0.6355 - val_accuracy: 0.6875 - val_precision: 0.8081 - val_recall: 0.5252 Epoch 2/5 200/200 [==============================] - 789s 4s/step - loss: 0.6364 - accuracy: 0.6367 - precision: 0.6395 - recall: 0.6082 - val_loss: 0.6106 - val_accuracy: 0.7188 - val_precision: 0.7202 - val_recall: 0.7530 Epoch 3/5 200/200 [==============================] - 787s 4s/step - loss: 0.6212 - accuracy: 0.6650 - precision: 0.6697 - recall: 0.6367 - val_loss: 0.5972 - val_accuracy: 0.7212 - val_precision: 0.7215 - val_recall: 0.7578 Epoch 4/5 200/200 [==============================] - 773s 4s/step - loss: 0.6076 - accuracy: 0.6687 - precision: 0.6723 - recall: 0.6442 - val_loss: 0.5869 - val_accuracy: 0.7312 - val_precision: 0.7538 - val_recall: 0.7194 Epoch 5/5 200/200 [==============================] - 775s 4s/step - loss: 0.5980 - accuracy: 0.6889 - precision: 0.6945 - recall: 0.6622 - val_loss: 0.5847 - val_accuracy: 0.7100 - val_precision: 0.6735 - val_recall: 0.8609
# Plot the training curves for the BERT fine-tuning run.
plot_history(history)
As another baseline, below is a pretrained model specialized in binary sentiment analysis (source):
# Extra baseline: off-the-shelf RoBERTa-large fine-tuned for binary
# sentiment, used through the transformers pipeline API; sanity-check
# it on a single sentence.
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
print(sentiment_analysis("The place is fine but the staff is not helpful."))
[{'label': 'NEGATIVE', 'score': 0.9994140863418579}]
Let's apply it to our test set:
# Predict 1 when the pipeline labels a tweet NEGATIVE (matching our target
# encoding), then score precision/recall/F2 (recall-weighted F-score).
# NOTE(review): y_test was last reassigned from small_test_set while the
# predictions run on X_test_raw — confirm the two are aligned.
y_pred = X_test_raw['text'].progress_apply(lambda text: int(sentiment_analysis(text)[0]['label']=='NEGATIVE'))
precision, recall, f2_score, _ = precision_recall_fscore_support(y_test, y_pred, beta=2, average='binary')
print('Precision:','{:.2%}'.format(precision))
print('Recall:', '{:.2%}'.format(recall))
print('F2 score:', '{:.2%}'.format(f2_score))
Precision: 77.60% Recall: 54.63% F2 score: 58.07%